In [1]:
import arcgis
from getpass import getpass
import pandas as pd
Create a GIS object instance. Authenticating with the account currently logged in through ArcGIS Pro is an option (left commented out below); here we log in with explicit credentials instead.
In [2]:
# gis = arcgis.gis.GIS('Pro')  # optional: use if ArcGIS Pro authentication is available
gis = arcgis.gis.GIS(
username='jmccune_retail',
password=getpass('Please enter the headless password: ')
)
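As a quick sanity check (not part of the original run), confirm the connection is live by inspecting the authenticated user.
In [ ]:
# display the user we just authenticated as
gis.users.me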
Create a Web GIS Item instance using the Item ID
In [3]:
trade_area_itemid = '44c0b666404c41b8a14e04c34d09be1e'
item = arcgis.gis.Item(gis=gis, itemid=trade_area_itemid)
item
Out[3]:
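As an aside, the content manager offers an equivalent lookup; a one-line sketch, assuming the same item ID:
In [ ]:
# equivalent retrieval through the content manager
item = gis.content.get(trade_area_itemid)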
Since the item contains only one feature layer, get the first layer in the item: the Feature Layer we need to work with.
In [4]:
feature_layer = item.layers[0]
feature_layer
Out[4]:
Use the query method to return the data as a FeatureSet.
In [5]:
feature_set = feature_layer.query()
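The query method also accepts filtering parameters if only a subset is needed; the where clause and field list below are purely illustrative:
In [ ]:
# illustrative only: restrict the rows and fields returned by the service
feature_set_subset = feature_layer.query(where='SALESVOL > 0', out_fields='SALESVOL,CITY')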
Take a quick look at the field names and aliases in the FeatureSet.
In [6]:
df_fields = pd.DataFrame([(field['name'], field['alias']) for field in feature_set.fields], columns=['NAME', 'ALIAS'])
df_fields
Out[6]:
Take advantage of the df property on the FeatureSet object returned from the query to convert the data to a Pandas DataFrame.
In [7]:
df = feature_set.df
df.head()
Out[7]:
Because I am neurotic and dislike the prefixes the join added to the field names when preparing the data in ArcMap, we clean them up a little here.
In [8]:
df.columns = [field.replace('Target_Locations_', '').replace('overlayTemplate_', '') for field in df.columns]
df.head()
Out[8]:
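For what it is worth, the same cleanup can be written as a single regular expression substitution; a stylistic alternative, not a change in behavior:
In [ ]:
import re
# strip either join prefix in one pass
df.columns = [re.sub(r'^(Target_Locations_|overlayTemplate_)', '', field) for field in df.columns]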
Use KMeans cluster analysis from the scikit-learn package to group demographically similar stores into segments.
In [9]:
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
%matplotlib inline
Create the KMeans clusterer, specifying five clusters, or segments.
In [10]:
km = KMeans(n_clusters=5)
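KMeans starts from a random initialization, so the segment numbering can shuffle between runs; a hedged sketch, not part of the original run, pins the seed for reproducibility:
In [ ]:
# illustrative only: a fixed random_state keeps the segment labels stable across runs
km = KMeans(n_clusters=5, random_state=42)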
Use the fit_predict method to create the clusters using just the numeric fields, excluding the OID and sales fields.
In [11]:
df_independent = df.select_dtypes(include=[np.number]).drop(['OBJECTID', 'SALESVOL'], axis=1)
fields_independent = df_independent.columns # save for later
field_name_dependent = 'SALESVOL' # keep track of this as well
In [12]:
df['segment'] = km.fit_predict(df_independent)
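After fitting, the model exposes a couple of diagnostics worth a glance; a quick sketch:
In [ ]:
# within-cluster sum of squared distances; lower means tighter clusters
print(km.inertia_)
# one center per segment, one coordinate per independent field
print(km.cluster_centers_.shape)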
In [13]:
df.head()
Out[13]:
In [14]:
df_count = df.segment.value_counts().reset_index().rename(columns={'index': 'segment', 'segment': 'count'})
df_mean = df.groupby('segment').mean()[field_name_dependent].reset_index().rename(
    columns={field_name_dependent: 'mean_sales'})
df_median = df.groupby('segment').median()[field_name_dependent].reset_index().rename(
    columns={field_name_dependent: 'median_sales'})
In [15]:
df_summary = pd.merge(df_count, df_mean)
df_summary = pd.merge(df_summary, df_median)
df_summary = df_summary.sort_values('mean_sales', ascending=False)
df_summary['mean_sales_zscore'] = (df_summary.mean_sales - df.SALESVOL.mean()) / df.SALESVOL.std(ddof=0)
# report the population standard deviation (ddof=0) to match the z-score above
print('Sales Mean: {:.2f}\nSales Standard Deviation: {:.2f}'.format(df.SALESVOL.mean(), df.SALESVOL.std(ddof=0)))
df_summary
Out[15]:
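Incidentally, the same count, mean, and median table can be built with a single groupby aggregation instead of three merges; a sketch:
In [ ]:
# equivalent summary in one aggregation
df_summary_alt = df.groupby('segment')[field_name_dependent].agg(['count', 'mean', 'median']).reset_index()
df_summary_alt.columns = ['segment', 'count', 'mean_sales', 'median_sales']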
Now, using data from this DataFrame, visualize the mean sales of each segment against the overall mean, with reference lines one-half standard deviation above and below it.
In [18]:
ax = df_summary.sort_values('mean_sales').plot.bar(y='mean_sales', x='segment', figsize=(15, 8))
ax.set_ylabel('mean sales')
ax.set_title('Mean Sales by KMeans Segment')
ax.axhline(y=df.SALESVOL.mean(), color='r') # add the mean for reference
ax.axhline(y=df.SALESVOL.mean() - df.SALESVOL.std() * 0.5, color='g')  # half a standard deviation below the mean
ax.axhline(y=df.SALESVOL.mean() + df.SALESVOL.std() * 0.5, color='g')  # half a standard deviation above the mean
Out[18]:
As a quick exploratory check, flag the stores in segment 3 and look at which cities they fall in.
In [21]:
df['segment_3'] = df.segment == 3  # boolean flag: is this store in segment 3?
df
Out[21]:
In [24]:
df.groupby("segment_3").CITY.value_counts()
Out[24]:
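If a wide layout reads more easily, pd.crosstab gives the same city breakdown:
In [ ]:
# cities as rows, the segment_3 flag as columns
pd.crosstab(df.CITY, df.segment_3)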
After doing this analysis, we begin by showing the locations of the top-performing stores on the map relative to the rest of the stores.
In [17]:
top_segment = df_summary.iloc[0].segment
top_segment
Out[17]:
This is a first stab at mapping the data just using the draw method, which plots the data on the map much like drawing with a Sharpie. The data is not organized into a layer per se, so it is not very useful. Still, it is interesting at a cursory level, just seeing the rest of the stores and the top performers.
In [26]:
store_map = gis.map('Meades Ranch, KS', 4)
store_map.basemap = 'gray'
store_map
In [19]:
# plot all other stores as small circles
for index, row in df[df.segment != top_segment].iterrows():
    store_map.draw(row.SHAPE, symbol={'type': 'esriSMS', 'style': 'esriSMSCircle', 'size': 4})

# plot the top-performing segment as larger green circles
for index, row in df[df.segment == top_segment].iterrows():
    store_map.draw(row.SHAPE, symbol={'type': 'esriSMS', 'style': 'esriSMSCircle', 'color': [0, 255, 0, 255], 'size': 6})
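If the drawing needs to be redone, the widget can be reset first; this assumes clear_graphics is available in your version of the API:
In [ ]:
# remove everything added with draw() so the map can be redrawn from scratch
store_map.clear_graphics()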
Finally, use Principal Component Analysis (PCA) to collapse the many demographic fields into two dimensions so the segments and their cluster centers can be visualized in a scatter plot.
In [18]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
In [41]:
pca = PCA(n_components=2)
In [50]:
# fit PCA once and reuse the projected coordinates for both axes
components = pca.fit_transform(df[fields_independent])
df['x'] = components[:, 0]
df['y'] = components[:, 1]
In [61]:
plt.scatter(df.x, df.y, c=df.segment)  # stores, colored by segment
# project the KMeans cluster centers into the same two PCA dimensions
cluster_centers = pd.DataFrame(pca.transform(km.cluster_centers_), columns=['x', 'y'])
cluster_centers['segment'] = range(len(cluster_centers))
plt.scatter(cluster_centers.x, cluster_centers.y, c='red', marker='x', s=100)  # cluster centers
plt.show()
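Because the scatter plot collapses many demographic fields into two axes, it is worth checking how much of the variance those two components actually capture; a quick sketch:
In [ ]:
# fraction of total variance captured by each component, and their sum
print(pca.explained_variance_ratio_)
print(pca.explained_variance_ratio_.sum())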